package com.shujia.core

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Demonstrates the `groupBy` transformation on an RDD by counting how many
 * student records fall under each gender value.
 *
 * Each input line is split on commas and the field at index 3 is used as the
 * grouping key (the surrounding comments identify it as the gender column).
 */
object Demo08GroupBy {

  /**
   * Entry point: builds a local SparkContext, groups the records read from
   * `Spark/data/students.txt` by gender and prints one `gender,count` line
   * per group.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {

    val conf: SparkConf = new SparkConf()
      .setAppName("Demo08GroupBy")
      .setMaster("local")

    val sc: SparkContext = new SparkContext(conf)

    // Always release the SparkContext, even when the job fails part-way.
    try {
      // Count the number of students per gender.
      val stuLineRDD: RDD[String] = sc.textFile("Spark/data/students.txt")

      // Keep only the field at index 3 of every comma-separated record.
      // NOTE(review): a line with fewer than four fields will throw
      // ArrayIndexOutOfBoundsException here -- confirm the input is clean.
      val genderRDD: RDD[String] = stuLineRDD.map(line => line.split(",")(3))

      // groupBy is a transformation: it groups the elements by the key
      // returned from the supplied function (here, the gender value itself).
      val groupRDD: RDD[(String, Iterable[String])] = genderRDD.groupBy(gender => gender)

      // The size of each group is the number of students with that gender.
      groupRDD
        .map { case (gender, members) => s"$gender,${members.size}" }
        .foreach(println)
    } finally {
      sc.stop()
    }

  }

}
